<img src="../imgs/RNN-rolled.png"/ width="80px" height="80px">
<img src="../imgs/RNN-unrolled.png"/ width="400px" height="400px">
<img src="../imgs/LSTM3-chain.png"/ width="60%">
In [ ]:
from keras.optimizers import SGD, RMSprop
from keras.preprocessing.text import one_hot, text_to_word_sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding, LSTM, GRU
from keras.preprocessing import sequence
In [ ]:
import os
import pickle
import numpy as np
In [ ]:
DATA_DIRECTORY = os.path.join(os.path.abspath(os.path.curdir), '..', 'data', 'word_embeddings')
print(DATA_DIRECTORY)
In [ ]:
male_posts = []
female_posts = []
In [ ]:
with open(os.path.join(DATA_DIRECTORY,"male_blog_list.txt"),"rb") as male_file:
male_posts= pickle.load(male_file)
with open(os.path.join(DATA_DIRECTORY,"female_blog_list.txt"),"rb") as female_file:
female_posts = pickle.load(female_file)
In [ ]:
filtered_male_posts = list(filter(lambda p: len(p) > 0, male_posts))
filtered_female_posts = list(filter(lambda p: len(p) > 0, female_posts))
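As a quick sanity check, it can help to print how many posts survive the empty-post filter (the counts depend on the pickled dataset):
In [ ]:
# number of non-empty posts per class (values depend on the dataset)
print('male posts:', len(filtered_male_posts))
print('female posts:', len(filtered_female_posts))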
In [ ]:
# text processing - one_hot builds a hashed index of the words,
# mapping each word to an integer in [1, n)
male_one_hot = []
female_one_hot = []
n = 30000
for post in filtered_male_posts:
    try:
        male_one_hot.append(one_hot(post, n, split=" ", lower=True))
    except Exception:
        continue
for post in filtered_female_posts:
    try:
        female_one_hot.append(one_hot(post, n, split=" ", lower=True))
    except Exception:
        continue
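Note that `one_hot` is hashing-based rather than a true vocabulary lookup, so distinct words can collide on the same index. A minimal sketch of its output on a made-up sentence:
In [ ]:
# one_hot hashes each word to an integer in [1, n); collisions are possible
print(one_hot("the quick brown fox", n, split=" ", lower=True))
# prints a list of 4 integers; the exact values depend on the hash function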
In [ ]:
# 0 for male, 1 for female
concatenate_array_rnn = np.concatenate((np.zeros(len(male_one_hot)),
                                        np.ones(len(female_one_hot))))
In [ ]:
from sklearn.model_selection import train_test_split
X_train_rnn, X_test_rnn, y_train_rnn, y_test_rnn = train_test_split(
    np.concatenate((male_one_hot, female_one_hot)),  # male first, matching the 0/1 labels above
    concatenate_array_rnn,
    test_size=0.2)
In [ ]:
maxlen = 100
X_train_rnn = sequence.pad_sequences(X_train_rnn, maxlen=maxlen)
X_test_rnn = sequence.pad_sequences(X_test_rnn, maxlen=maxlen)
print('X_train_rnn shape:', X_train_rnn.shape, y_train_rnn.shape)
print('X_test_rnn shape:', X_test_rnn.shape, y_test_rnn.shape)
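By default `pad_sequences` left-pads shorter posts with zeros and truncates longer ones from the front, so every row ends up with exactly `maxlen` entries. A toy example:
In [ ]:
# shorter sequences are zero-padded on the left; longer ones are truncated
print(sequence.pad_sequences([[1, 2, 3]], maxlen=5))           # [[0 0 1 2 3]]
print(sequence.pad_sequences([[1, 2, 3, 4, 5, 6]], maxlen=5))  # [[2 3 4 5 6]]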
In [ ]:
max_features = 30000     # size of the one_hot index space
dimension = 128          # embedding dimension
output_dimension = 128   # LSTM hidden units
model = Sequential()
model.add(Embedding(max_features, dimension))
model.add(LSTM(output_dimension))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))
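To inspect the layer stack and parameter counts (depending on the Keras version, the model may need an explicit input shape or a forward pass before `summary()` can report output shapes):
In [ ]:
model.summary()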
In [ ]:
# note: binary_crossentropy is the usual loss for a sigmoid classifier;
# mean_squared_error also trains here, but typically converges more slowly
model.compile(loss='mean_squared_error', optimizer='sgd', metrics=['accuracy'])
In [ ]:
model.fit(X_train_rnn, y_train_rnn, batch_size=32,
          epochs=4, validation_data=(X_test_rnn, y_test_rnn))
In [ ]:
score, acc = model.evaluate(X_test_rnn, y_test_rnn, batch_size=32)
In [ ]:
print(score, acc)
In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer
In [ ]:
vectorizer = TfidfVectorizer(decode_error='ignore', norm='l2', min_df=5)
# fit one shared vocabulary, then transform each group into the same feature space
vectorizer.fit(filtered_male_posts + filtered_female_posts)
tfidf_male = vectorizer.transform(filtered_male_posts)
tfidf_female = vectorizer.transform(filtered_female_posts)
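Since both groups were transformed with the same fitted vectorizer, their matrices share one feature space; a quick check (the exact vocabulary size depends on the corpus):
In [ ]:
print(tfidf_male.shape, tfidf_female.shape)  # same number of columns
print('vocabulary size:', len(vectorizer.vocabulary_))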
In [ ]:
flattened_array_tfidf_male = tfidf_male.toarray()
flattened_array_tfidf_female = tfidf_female.toarray()
In [ ]:
y_rnn = np.concatenate((np.zeros(len(flattened_array_tfidf_male)),
                        np.ones(len(flattened_array_tfidf_female))))
In [ ]:
X_train_rnn, X_test_rnn, y_train_rnn, y_test_rnn = train_test_split(
    np.concatenate((flattened_array_tfidf_male, flattened_array_tfidf_female)),
    y_rnn, test_size=0.2)
In [ ]:
maxlen = 100
# keep only the first maxlen TF-IDF features; dtype='float32' preserves the
# continuous TF-IDF values (the default int32 would truncate them all to 0)
X_train_rnn = sequence.pad_sequences(X_train_rnn, maxlen=maxlen, dtype='float32')
X_test_rnn = sequence.pad_sequences(X_test_rnn, maxlen=maxlen, dtype='float32')
print('X_train_rnn shape:', X_train_rnn.shape, y_train_rnn.shape)
print('X_test_rnn shape:', X_test_rnn.shape, y_test_rnn.shape)
In [ ]:
# TF-IDF features are continuous values, not integer word indices, so skip the
# Embedding layer (which expects integers) and feed the LSTM directly,
# one feature per timestep
X_train_rnn = X_train_rnn.reshape(X_train_rnn.shape[0], maxlen, 1)
X_test_rnn = X_test_rnn.reshape(X_test_rnn.shape[0], maxlen, 1)
model = Sequential()
model.add(LSTM(output_dimension, input_shape=(maxlen, 1)))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))
In [ ]:
model.compile(loss='mean_squared_error', optimizer='sgd', metrics=['accuracy'])
In [ ]:
model.fit(X_train_rnn, y_train_rnn,
          batch_size=32, epochs=1,
          validation_data=(X_test_rnn, y_test_rnn))
In [ ]:
score, acc = model.evaluate(X_test_rnn, y_test_rnn, batch_size=32)
In [ ]:
print(score, acc)
In [ ]:
# join all the male text data into one string
male_post = ' '.join(filtered_male_posts)

# build the character set for the male posts
character_set_male = set(male_post)

# build two indices - character -> index and index -> character
char_indices = dict((c, i) for i, c in enumerate(character_set_male))
indices_char = dict((i, c) for i, c in enumerate(character_set_male))

# cut the text into semi-redundant sequences of maxlen characters
maxlen = 20
step = 1
sentences = []
next_chars = []
for i in range(0, len(male_post) - maxlen, step):
    sentences.append(male_post[i : i + maxlen])
    next_chars.append(male_post[i + maxlen])
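With step = 1 this produces one training sequence per character position, so the number of sequences is close to the length of the joined text:
In [ ]:
print('corpus length:', len(male_post))
print('number of sequences:', len(sentences))
print('character set size:', len(character_set_male))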
In [ ]:
# vectorisation of input: one array entry per extracted sequence
x_male = np.zeros((len(sentences), maxlen, len(character_set_male)), dtype=bool)
y_male = np.zeros((len(sentences), len(character_set_male)), dtype=bool)
print(x_male.shape, y_male.shape)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x_male[i, t, char_indices[char]] = 1
    y_male[i, char_indices[next_chars[i]]] = 1
print(x_male.shape, y_male.shape)
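Each row of `x_male[i]` should be a one-hot vector (exactly one `True` per character position), which is easy to assert:
In [ ]:
# every timestep of every sequence encodes exactly one character
assert (x_male.sum(axis=2) == 1).all()
assert (y_male.sum(axis=1) == 1).all()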
In [ ]:
# build the model: a single LSTM over character one-hot vectors
print('Build model...')
model = Sequential()
model.add(LSTM(128, input_shape=(maxlen, len(character_set_male))))
model.add(Dense(len(character_set_male)))
model.add(Activation('softmax'))
optimizer = RMSprop(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
In [ ]:
import random, sys
In [ ]:
# helper function to sample an index from a probability array
def sample(a, diversity=0.75):
    # with probability (1 - diversity), greedily take the most likely index;
    # otherwise rejection-sample an index in proportion to its probability
    if random.random() > diversity:
        return np.argmax(a)
    while True:
        i = random.randint(0, len(a) - 1)
        if a[i] > random.random():
            return i
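On a toy distribution, low diversity makes `sample` behave almost greedily, while higher diversity lets lower-probability indices through (output varies run to run):
In [ ]:
probs = np.array([0.1, 0.7, 0.2])
print([sample(probs, diversity=0.2) for _ in range(10)])  # mostly 1 (the argmax)
print([sample(probs, diversity=0.9) for _ in range(10)])  # more varied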
In [ ]:
# train the model, outputting generated text after each iteration
for iteration in range(1, 10):
    print()
    print('-' * 50)
    print('Iteration', iteration)
    model.fit(x_male, y_male, batch_size=128, epochs=1)

    start_index = random.randint(0, len(male_post) - maxlen - 1)

    for diversity in [0.2, 0.4, 0.6, 0.8]:
        print()
        print('----- diversity:', diversity)

        generated = ''
        sentence = male_post[start_index : start_index + maxlen]
        generated += sentence
        print('----- Generating with seed: "' + sentence + '"')

        for _ in range(400):
            # one-hot encode the current window and predict the next character
            x = np.zeros((1, maxlen, len(character_set_male)))
            for t, char in enumerate(sentence):
                x[0, t, char_indices[char]] = 1.
            preds = model.predict(x, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]
            generated += next_char
            sentence = sentence[1:] + next_char

        print(generated)
        print()